# Report setup: center figures in the knitted output.
knitr::opts_chunk$set(fig.align="center")
library(rstanarm)
library(tidyverse)
library(tidybayes)
library(modelr)
library(ggplot2)
library(magrittr)
library(emmeans)
library(bayesplot)
library(brms)
library(gganimate)
theme_set(theme_light())
# Project-local helpers used below: user_response_posterior_draws_plot() and
# user_response_diff_plot() (definitions not visible in this file).
source('helper_functions.R')
In our experiment, we used a visualization recommendation algorithm (composed of one search algorithm and one oracle algorithm) to generate visualizations for the user on one of two datasets. We then asked the user to evaluate the tool on a variety of metrics (confidence in understanding data, confidence in answer, efficiency, ease of use, utility, and overall).
Given a search algorithm (bfs or dfs), an oracle (CompassQL or dziban), and a dataset (birdstrikes or movies), we would like to predict a user’s score for a given metric. In addition, we would like to know if the choice of search algorithm and oracle, as well as participant group (student or professional) has any meaningful impact on a user’s rating for these metrics.
Our weakly-informative prior (normal(0.26, 1.26)) was derived from pilot studies, and it summarizes the user rating for each metric. Because our pilot study was small, we chose to aggregate our data (rather than deriving separate priors for each metric) to minimize the effect of biases.
Since ratings can have values between -2 and 2 inclusive, we perform ordinal regression.
# Metrics collected in the post-task survey; each gets its own ordinal model.
# (A duplicated second assignment of `analyses` was removed.)
analyses <- c("confidence.udata", "confidence.ans", "efficiency", "ease.of.use", "utility", "overall")
confidence_metrics <- c("confidence.udata", "confidence.ans")
preference_metrics <- c("efficiency", "ease.of.use", "utility", "overall")

user_response_data <- read.csv('data/ptask_responses.csv')

# Normalize condition labels to display capitalization (CompassQL/Dziban,
# BFS/DFS). fixed = TRUE because the patterns are literal strings, not regexes.
user_response_data$oracle <- gsub('compassql', 'CompassQL', user_response_data$oracle, fixed = TRUE)
user_response_data$oracle <- gsub('dziban', 'Dziban', user_response_data$oracle, fixed = TRUE)
user_response_data$search <- gsub('bfs', 'BFS', user_response_data$search, fixed = TRUE)
user_response_data$search <- gsub('dfs', 'DFS', user_response_data$search, fixed = TRUE)

# Ratings are ordinal (-2..2): store them as ordered factors so brms fits
# cumulative-probit models; experimental conditions become plain factors.
user_response_data[, analyses] <- lapply(user_response_data[, analyses], ordered)
user_response_data <- user_response_data %>%
  mutate(
    dataset = as.factor(dataset),
    oracle = as.factor(oracle),
    search = as.factor(search),
    task = as.factor(task)
  )

# Containers for the fitted models and per-metric difference plots.
models <- list()
search_differences <- list()
oracle_differences <- list()
alg_differences <- list()
participant_group_differences <- list()
seed <- 12
# Ordinal (cumulative probit) regression of confidence in understanding the
# data on oracle x search (interaction), plus dataset, task, and participant
# group, with a per-participant random intercept. The intercept prior is the
# weakly-informative pilot-study prior; `file` caches the fit on disk.
models$confidence_udata <- brm(
  bf(confidence.udata ~ oracle * search + dataset + task + participant_group + (1 | participant_id)),
  data = user_response_data,
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  chains = 2, cores = 2,
  iter = 2500, warmup = 1000,
  control = list(adapt_delta = 0.99),
  seed = seed,
  file = "models/confidence_udata"
)
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$confidence_udata)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: confidence.udata ~ oracle * search + dataset + task + participant_group + (1 | participant_id)
## Data: user_response_data (Number of observations: 288)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 72)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.12 0.16 0.83 1.45 1.00 888 1378
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -2.14 0.43 -2.98 -1.34 1.00 1223
## Intercept[2] -0.95 0.41 -1.75 -0.17 1.00 1142
## Intercept[3] 1.25 0.41 0.46 2.05 1.00 1192
## oracleDziban -0.00 0.44 -0.87 0.88 1.00 855
## searchDFS -0.65 0.42 -1.49 0.15 1.00 930
## datasetmovies 0.16 0.30 -0.41 0.76 1.00 1230
## task2.RetrieveValue 0.27 0.20 -0.14 0.66 1.00 2184
## task3.Prediction 0.12 0.20 -0.27 0.52 1.00 2585
## task4.Exploration 0.57 0.21 0.16 0.96 1.00 2068
## participant_groupstudent 0.19 0.30 -0.38 0.76 1.00 1091
## oracleDziban:searchDFS 0.82 0.61 -0.36 1.96 1.00 845
## Tail_ESS
## Intercept[1] 1613
## Intercept[2] 1763
## Intercept[3] 1575
## oracleDziban 1302
## searchDFS 1488
## datasetmovies 1604
## task2.RetrieveValue 2192
## task3.Prediction 2310
## task4.Exploration 2031
## participant_groupstudent 1747
## oracleDziban:searchDFS 1524
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
# plot(models$confidence_udata)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters mean that our model has difficulty differentiating their individual effects).
# Pairs plot of selected population-level coefficients to check for strong
# posterior correlations. Fixed: coefficient names must use the recoded factor
# levels ("Dziban", "DFS") — see the summary() output above — not the original
# lowercase labels, otherwise pairs() cannot find the parameters.
pairs(
  models$confidence_udata,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)
We now look at a response for confidence in understanding the data using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.
confidence_udata_plot <- user_response_posterior_draws_plot(user_response_data, models$confidence_udata, NULL, "Oracle/Search Combination", "Rating")
confidence_udata_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
confidence_udata_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS CompassQL 1.11 0.792 1.42 0.95 mean qi
## 2 BFS Dziban 1.11 0.792 1.42 0.95 mean qi
## 3 DFS CompassQL 0.794 0.444 1.10 0.95 mean qi
## 4 DFS Dziban 1.19 0.889 1.49 0.95 mean qi
## 5 BFS CompassQL 1.11 1 1.21 0.5 mean qi
## 6 BFS Dziban 1.11 1 1.22 0.5 mean qi
## 7 DFS CompassQL 0.794 0.694 0.903 0.5 mean qi
## 8 DFS Dziban 1.19 1.08 1.29 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any meaningful difference in the confidence-in-understanding-data rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).
# Posterior predictive draws marginalizing over participants (re_formula = NA
# drops the random effect), plus a combined search+oracle condition label.
confidence_udata_predictive_data <- user_response_data %>%
  add_predicted_draws(models$confidence_udata, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))
Differences in user score by search algorithm.
search_differences$confidence_udata <- user_response_diff_plot(confidence_udata_predictive_data, "search", "confidence.udata", "Difference in Confidence in Understanding Data Rating", "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$confidence_udata$plot
Differences in user score by oracle.
oracle_differences$confidence_udata <- user_response_diff_plot(confidence_udata_predictive_data, "oracle", "confidence.udata", "Difference in Confidence in Understanding Data Rating", "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$confidence_udata$plot
Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)
confidence_udata_predictive_data_subset <- subset(confidence_udata_predictive_data, alg %in% c("DFS CompassQL", "BFS Dziban"))
alg_differences$confidence_udata <- user_response_diff_plot(confidence_udata_predictive_data_subset, "alg", "confidence.udata", "Difference in Confidence in Understanding Data Rating", "Task", NULL)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$confidence_udata$plot
Differences in user score by participant group
participant_group_differences$confidence_udata <- user_response_diff_plot(confidence_udata_predictive_data, "participant_group", "confidence.udata", "Difference in Confidence in Understanding Data Rating", "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$confidence_udata$plot
# Cumulative-probit (ordinal) model of confidence in the answer; same
# specification as the confidence_udata model above. `file` caches the fit,
# so re-knitting reloads rather than re-samples.
models$confidence_ans <- brm(
formula = bf(confidence.ans ~ oracle * search + dataset + task + participant_group + (1 | participant_id)),
family = cumulative("probit"),
prior = prior(normal(0.26, 1.26), class = Intercept),
chains = 2,
cores = 2,
iter = 2500,
warmup = 1000,
data = user_response_data,
control = list(adapt_delta = 0.99),
file = "models/confidence_ans",
seed = seed
)
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$confidence_ans)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: confidence.ans ~ oracle * search + dataset + task + participant_group + (1 | participant_id)
## Data: user_response_data (Number of observations: 288)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 72)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.56 0.13 0.31 0.82 1.00 826 1264
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -3.28 0.41 -4.09 -2.49 1.00 2880
## Intercept[2] -2.46 0.31 -3.08 -1.86 1.00 2848
## Intercept[3] -1.61 0.29 -2.19 -1.04 1.00 3037
## Intercept[4] 0.16 0.27 -0.37 0.70 1.00 2751
## oracleDziban 0.23 0.27 -0.30 0.76 1.00 2231
## searchDFS 0.06 0.27 -0.46 0.58 1.00 2490
## datasetmovies -0.16 0.19 -0.53 0.22 1.00 2179
## task2.RetrieveValue -0.29 0.20 -0.69 0.11 1.00 3629
## task3.Prediction -1.08 0.21 -1.48 -0.68 1.00 3447
## task4.Exploration -0.69 0.20 -1.08 -0.30 1.00 3787
## participant_groupstudent 0.13 0.20 -0.27 0.53 1.00 2701
## oracleDziban:searchDFS -0.04 0.38 -0.78 0.72 1.00 2197
## Tail_ESS
## Intercept[1] 1879
## Intercept[2] 2088
## Intercept[3] 2300
## Intercept[4] 2254
## oracleDziban 2340
## searchDFS 2382
## datasetmovies 2254
## task2.RetrieveValue 2275
## task3.Prediction 2568
## task4.Exploration 2632
## participant_groupstudent 2006
## oracleDziban:searchDFS 1791
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
# plot(models$confidence_ans)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plot of selected population-level coefficients to check for strong
# posterior correlations. Fixed: coefficient names must use the recoded factor
# levels ("Dziban", "DFS") — see the summary() output above — not the original
# lowercase labels, otherwise pairs() cannot find the parameters.
pairs(
  models$confidence_ans,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)
We now look at a response for confidence in answer using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.
confidence_ans_plot <- user_response_posterior_draws_plot(user_response_data, models$confidence_ans, NULL, "Oracle/Search Combination", "Rating")
confidence_ans_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
confidence_ans_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS CompassQL 1.06 0.764 1.32 0.95 mean qi
## 2 BFS Dziban 1.20 0.903 1.46 0.95 mean qi
## 3 DFS CompassQL 1.09 0.778 1.38 0.95 mean qi
## 4 DFS Dziban 1.21 0.931 1.44 0.95 mean qi
## 5 BFS CompassQL 1.06 0.958 1.15 0.5 mean qi
## 6 BFS Dziban 1.20 1.11 1.29 0.5 mean qi
## 7 DFS CompassQL 1.09 1 1.19 0.5 mean qi
## 8 DFS Dziban 1.21 1.12 1.29 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any meaningful difference in the confidence-in-answer rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).
# Posterior predictive draws marginalizing over participants (re_formula = NA
# drops the random effect), plus a combined search+oracle condition label.
confidence_ans_predictive_data <- user_response_data %>%
  add_predicted_draws(models$confidence_ans, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))
Differences in user score by search algorithm.
search_differences$confidence_ans <- user_response_diff_plot(confidence_ans_predictive_data, "search", "confidence.ans", "Difference in Confidence in Answer Rating", "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$confidence_ans$plot
Differences in user score by oracle.
oracle_differences$confidence_ans <- user_response_diff_plot(confidence_ans_predictive_data, "oracle", "confidence.ans", "Difference in Confidence in Answer Rating", "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$confidence_ans$plot
Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)
confidence_ans_predictive_data_subset <- subset(confidence_ans_predictive_data, alg %in% c("DFS CompassQL", "BFS Dziban"))
alg_differences$confidence_ans <- user_response_diff_plot(confidence_ans_predictive_data_subset, "alg", "confidence.ans", "Difference in Confidence in Answer Rating", "Task", NULL)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$confidence_ans$plot
Differences in user score by participant group
participant_group_differences$confidence_ans <- user_response_diff_plot(confidence_ans_predictive_data, "participant_group", "confidence.ans", "Difference in Confidence in Answer Rating", "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$confidence_ans$plot
# Cumulative-probit (ordinal) model of the efficiency rating; same
# specification as the preceding models. The stray `filename = "efficiency"`
# assignment (an unused leftover — the cache path is passed directly via
# `file`, as in every other section) was removed.
models$efficiency <- brm(
  formula = bf(efficiency ~ oracle * search + dataset + task + participant_group + (1 | participant_id)),
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  data = user_response_data,
  control = list(adapt_delta = 0.99),
  file = "models/efficiency",
  seed = seed
)
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$efficiency)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: efficiency ~ oracle * search + dataset + task + participant_group + (1 | participant_id)
## Data: user_response_data (Number of observations: 288)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 72)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.13 0.15 0.87 1.46 1.00 1024 1546
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -2.65 0.43 -3.50 -1.82 1.00 1207
## Intercept[2] -1.05 0.40 -1.83 -0.26 1.00 1179
## Intercept[3] -0.22 0.40 -1.01 0.59 1.00 1174
## Intercept[4] 1.04 0.40 0.25 1.86 1.00 1179
## oracleDziban -0.13 0.43 -0.98 0.72 1.00 856
## searchDFS -1.26 0.43 -2.09 -0.44 1.00 964
## datasetmovies 0.18 0.30 -0.40 0.79 1.00 1266
## task2.RetrieveValue -0.27 0.18 -0.62 0.08 1.00 3519
## task3.Prediction 0.26 0.19 -0.10 0.63 1.00 4036
## task4.Exploration 0.38 0.19 0.01 0.76 1.00 3832
## participant_groupstudent 0.16 0.31 -0.47 0.79 1.00 1234
## oracleDziban:searchDFS 0.81 0.60 -0.38 1.98 1.00 918
## Tail_ESS
## Intercept[1] 1370
## Intercept[2] 1451
## Intercept[3] 1476
## Intercept[4] 1613
## oracleDziban 1517
## searchDFS 1315
## datasetmovies 1662
## task2.RetrieveValue 2602
## task3.Prediction 2581
## task4.Exploration 2477
## participant_groupstudent 1335
## oracleDziban:searchDFS 1601
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
# plot(models$efficiency)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plot of selected population-level coefficients to check for strong
# posterior correlations. Fixed: coefficient names must use the recoded factor
# levels ("Dziban", "DFS") — see the summary() output above — not the original
# lowercase labels, otherwise pairs() cannot find the parameters.
pairs(
  models$efficiency,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)
We now look at a response for efficiency using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.
efficiency_plot <- user_response_posterior_draws_plot(user_response_data, models$efficiency, NULL, "Oracle/Search Combination", "Rating")
efficiency_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
efficiency_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS CompassQL 0.797 0.264 1.29 0.95 mean qi
## 2 BFS Dziban 0.693 0.166 1.18 0.95 mean qi
## 3 DFS CompassQL -0.262 -0.764 0.278 0.95 mean qi
## 4 DFS Dziban 0.318 -0.25 0.834 0.95 mean qi
## 5 BFS CompassQL 0.797 0.625 0.972 0.5 mean qi
## 6 BFS Dziban 0.693 0.528 0.875 0.5 mean qi
## 7 DFS CompassQL -0.262 -0.444 -0.0833 0.5 mean qi
## 8 DFS Dziban 0.318 0.125 0.514 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any meaningful difference in the efficiency rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).
# Posterior predictive draws marginalizing over participants (re_formula = NA
# drops the random effect), plus a combined search+oracle condition label.
efficiency_predictive_data <- user_response_data %>%
  add_predicted_draws(models$efficiency, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))
Differences in user score by search algorithm.
search_differences$efficiency <- user_response_diff_plot(efficiency_predictive_data, "search", "efficiency", "Difference in Efficiency Rating", "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$efficiency$plot
Differences in user score by oracle.
oracle_differences$efficiency <- user_response_diff_plot(efficiency_predictive_data, "oracle", "efficiency", "Difference in Efficiency Rating", "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$efficiency$plot
Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)
# Compare the two crossed conditions only: DFS+CompassQL vs BFS+Dziban.
# Renamed from `efficiency_predictive_data_data_subset` (duplicated "data")
# for consistency with the other sections' `*_predictive_data_subset` naming;
# the variable is used only on these two lines.
efficiency_predictive_data_subset <- subset(efficiency_predictive_data, alg %in% c("DFS CompassQL", "BFS Dziban"))
alg_differences$efficiency <- user_response_diff_plot(efficiency_predictive_data_subset, "alg", "efficiency", "Difference in Efficiency Rating", "Task", NULL)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$efficiency$plot
Differences in user score by participant group
participant_group_differences$efficiency <- user_response_diff_plot(efficiency_predictive_data, "participant_group", "efficiency", "Difference in Efficiency Rating", "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$efficiency$plot
# Cumulative-probit (ordinal) model of the ease-of-use rating; same
# specification as the preceding models. `file` caches the fit, so
# re-knitting reloads rather than re-samples.
models$ease_of_use <- brm(
formula = bf(ease.of.use ~ oracle * search + dataset + task + participant_group + (1 | participant_id)),
family = cumulative("probit"),
prior = prior(normal(0.26, 1.26), class = Intercept),
chains = 2,
cores = 2,
iter = 2500,
warmup = 1000,
data = user_response_data,
control = list(adapt_delta = 0.99),
file = "models/ease_of_use",
seed = seed
)
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$ease_of_use)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: ease.of.use ~ oracle * search + dataset + task + participant_group + (1 | participant_id)
## Data: user_response_data (Number of observations: 288)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 72)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.05 0.14 0.79 1.36 1.00 1050 1608
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -2.68 0.42 -3.55 -1.88 1.00 1334
## Intercept[2] -1.25 0.37 -1.95 -0.53 1.00 1221
## Intercept[3] -0.30 0.37 -1.01 0.42 1.00 1189
## Intercept[4] 1.62 0.38 0.90 2.35 1.00 1276
## oracleDziban -0.32 0.41 -1.13 0.48 1.00 962
## searchDFS -1.27 0.40 -2.07 -0.48 1.00 953
## datasetmovies 0.31 0.29 -0.24 0.88 1.00 1018
## task2.RetrieveValue 0.19 0.19 -0.18 0.56 1.00 3365
## task3.Prediction 0.27 0.18 -0.09 0.62 1.00 2785
## task4.Exploration 0.38 0.19 0.02 0.74 1.00 3326
## participant_groupstudent 0.47 0.29 -0.10 1.03 1.00 1071
## oracleDziban:searchDFS 0.79 0.56 -0.28 1.91 1.00 902
## Tail_ESS
## Intercept[1] 1616
## Intercept[2] 1643
## Intercept[3] 1771
## Intercept[4] 1565
## oracleDziban 1405
## searchDFS 1502
## datasetmovies 1490
## task2.RetrieveValue 2390
## task3.Prediction 2373
## task4.Exploration 2453
## participant_groupstudent 1417
## oracleDziban:searchDFS 1451
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
# plot(models$ease_of_use)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plot of selected population-level coefficients to check for strong
# posterior correlations. Fixed: coefficient names must use the recoded factor
# levels ("Dziban", "DFS") — see the summary() output above — not the original
# lowercase labels, otherwise pairs() cannot find the parameters.
pairs(
  models$ease_of_use,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)
We now look at a response for ease of use using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.
ease_of_use_plot <- user_response_posterior_draws_plot(user_response_data, models$ease_of_use, NULL, "Oracle/Search Combination", "Rating")
ease_of_use_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
ease_of_use_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS CompassQL 0.933 0.555 1.28 0.95 mean qi
## 2 BFS Dziban 0.738 0.319 1.11 0.95 mean qi
## 3 DFS CompassQL 0.0732 -0.403 0.528 0.95 mean qi
## 4 DFS Dziban 0.418 -0.0417 0.806 0.95 mean qi
## 5 BFS CompassQL 0.933 0.806 1.06 0.5 mean qi
## 6 BFS Dziban 0.738 0.611 0.875 0.5 mean qi
## 7 DFS CompassQL 0.0732 -0.0833 0.236 0.5 mean qi
## 8 DFS Dziban 0.418 0.264 0.569 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any meaningful difference in the ease-of-use rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).
# Posterior predictive draws marginalizing over participants (re_formula = NA
# drops the random effect), plus a combined search+oracle condition label.
ease_of_use_predictive_data <- user_response_data %>%
  add_predicted_draws(models$ease_of_use, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))
Differences in user score by search algorithm.
search_differences$ease_of_use <- user_response_diff_plot(ease_of_use_predictive_data, "search", "ease.of.use", "Difference in Ease of Use Rating", "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$ease_of_use$plot
Differences in user score by oracle.
oracle_differences$ease_of_use <- user_response_diff_plot(ease_of_use_predictive_data, "oracle", "ease.of.use", "Difference in Ease of Use Rating", "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$ease_of_use$plot
Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)
ease_of_use_predictive_data_subset <- subset(ease_of_use_predictive_data, alg %in% c("DFS CompassQL", "BFS Dziban"))
alg_differences$ease_of_use <- user_response_diff_plot(ease_of_use_predictive_data_subset, "alg", "ease.of.use", "Difference in Ease of Use Rating", "Task", NULL)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$ease_of_use$plot
Differences in user score by participant group
participant_group_differences$ease_of_use <- user_response_diff_plot(ease_of_use_predictive_data, "participant_group", "ease.of.use", "Difference in Ease of Use Rating", "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$ease_of_use$plot
# Cumulative-probit (ordinal) model of the utility rating; same specification
# as the preceding models. `file` caches the fit, so re-knitting reloads
# rather than re-samples.
models$utility <- brm(
formula = bf(utility ~ oracle * search + dataset + task + participant_group + (1 | participant_id)),
family = cumulative("probit"),
prior = prior(normal(0.26, 1.26), class = Intercept),
chains = 2,
cores = 2,
iter = 2500,
warmup = 1000,
data = user_response_data,
control = list(adapt_delta = 0.99),
file = "models/utility",
seed = seed
)
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$utility)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: utility ~ oracle * search + dataset + task + participant_group + (1 | participant_id)
## Data: user_response_data (Number of observations: 288)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 72)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.95 0.14 0.70 1.24 1.00 968 1571
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -1.77 0.36 -2.48 -1.07 1.00 1434
## Intercept[2] -0.61 0.35 -1.30 0.10 1.00 1435
## Intercept[3] 0.05 0.35 -0.62 0.75 1.00 1440
## Intercept[4] 1.38 0.35 0.71 2.11 1.00 1328
## oracleDziban 0.02 0.37 -0.68 0.75 1.00 1075
## searchDFS -0.83 0.37 -1.60 -0.14 1.00 947
## datasetmovies 0.22 0.27 -0.29 0.75 1.00 961
## task2.RetrieveValue -0.16 0.18 -0.50 0.18 1.00 3620
## task3.Prediction 0.35 0.18 -0.01 0.71 1.00 3280
## task4.Exploration 0.59 0.19 0.24 0.96 1.00 3558
## participant_groupstudent 0.20 0.26 -0.30 0.72 1.00 1314
## oracleDziban:searchDFS 0.44 0.53 -0.58 1.48 1.00 979
## Tail_ESS
## Intercept[1] 1586
## Intercept[2] 1493
## Intercept[3] 1468
## Intercept[4] 1557
## oracleDziban 1594
## searchDFS 1305
## datasetmovies 1228
## task2.RetrieveValue 2528
## task3.Prediction 2193
## task4.Exploration 1858
## participant_groupstudent 1678
## oracleDziban:searchDFS 1486
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
# plot(models$utility)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters mean that our model has difficulty differentiating their individual effects).
# Pairs plot of selected population-level coefficients to check for strong
# posterior correlations. Fixed: coefficient names must use the recoded factor
# levels ("Dziban", "DFS") — see the summary() output above — not the original
# lowercase labels, otherwise pairs() cannot find the parameters.
pairs(
  models$utility,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)
We now look at a response for Utility using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.
utility_plot <- user_response_posterior_draws_plot(user_response_data, models$utility, NULL, "Oracle/Search Combination", "Rating")
utility_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
utility_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS CompassQL 0.624 0.111 1.11 0.95 mean qi
## 2 BFS Dziban 0.640 0.111 1.12 0.95 mean qi
## 3 DFS CompassQL -0.148 -0.694 0.375 0.95 mean qi
## 4 DFS Dziban 0.288 -0.25 0.819 0.95 mean qi
## 5 BFS CompassQL 0.624 0.458 0.792 0.5 mean qi
## 6 BFS Dziban 0.640 0.472 0.819 0.5 mean qi
## 7 DFS CompassQL -0.148 -0.333 0.0417 0.5 mean qi
## 8 DFS Dziban 0.288 0.111 0.472 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any meaningful difference in the utility rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).
# Posterior predictive draws marginalizing over participants (re_formula = NA
# drops the random effect), plus a combined search+oracle condition label.
utility_predictive_data <- user_response_data %>%
  add_predicted_draws(models$utility, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))
Differences in user score by search algorithm.
search_differences$utility <- user_response_diff_plot(utility_predictive_data, "search", "utility", "Difference in Utility Rating", "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$utility$plot
Differences in user score by oracle.
oracle_differences$utility <- user_response_diff_plot(utility_predictive_data, "oracle", "utility", "Difference in Utility Rating", "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$utility$plot
Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)
utility_predictive_data_subset <- subset(utility_predictive_data, alg %in% c("DFS CompassQL", "BFS Dziban"))
alg_differences$utility <- user_response_diff_plot(utility_predictive_data_subset, "alg", "utility", "Difference in Utility Rating", "Task", NULL)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$utility$plot
Differences in user score by participant group
participant_group_differences$utility <- user_response_diff_plot(utility_predictive_data, "participant_group", "utility", "Difference in Utility Rating", "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$utility$plot
# Ordinal (cumulative probit) regression for the "overall" rating, with a
# random intercept per participant to account for repeated measures (4 tasks
# per participant).
models$overall <- brm(
formula = bf(overall ~ oracle * search + dataset + task + participant_group + (1 | participant_id)),
family = cumulative("probit"),
# Weakly-informative prior derived from the pilot study (see introduction),
# placed on the latent-scale threshold intercepts only.
prior = prior(normal(0.26, 1.26), class = Intercept),
chains = 2,
cores = 2,
iter = 2500,
warmup = 1000,
data = user_response_data,
# High adapt_delta to reduce divergent transitions in the ordinal model.
control = list(adapt_delta = 0.99),
# Cache the fit on disk; reruns load "models/overall.rds" instead of resampling.
file = "models/overall",
seed = seed
)
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$overall)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: overall ~ oracle * search + dataset + task + participant_group + (1 | participant_id)
## Data: user_response_data (Number of observations: 288)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 72)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.41 0.18 1.09 1.79 1.00 1128 1715
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -2.83 0.53 -3.84 -1.76 1.00 918
## Intercept[2] -1.46 0.49 -2.43 -0.44 1.00 869
## Intercept[3] -0.22 0.48 -1.15 0.76 1.00 878
## Intercept[4] 1.96 0.50 1.00 3.00 1.00 933
## oracleDziban 0.01 0.52 -1.04 1.01 1.00 713
## searchDFS -0.98 0.53 -2.07 0.02 1.00 799
## datasetmovies -0.06 0.36 -0.80 0.65 1.00 740
## task2.RetrieveValue -0.09 0.19 -0.46 0.26 1.00 3886
## task3.Prediction 0.39 0.19 0.02 0.76 1.00 3125
## task4.Exploration 0.61 0.20 0.22 1.01 1.00 3503
## participant_groupstudent 0.48 0.36 -0.21 1.22 1.00 982
## oracleDziban:searchDFS 0.70 0.75 -0.75 2.21 1.01 672
## Tail_ESS
## Intercept[1] 1519
## Intercept[2] 1347
## Intercept[3] 1197
## Intercept[4] 1338
## oracleDziban 967
## searchDFS 1243
## datasetmovies 1009
## task2.RetrieveValue 2836
## task3.Prediction 2396
## task4.Exploration 2506
## participant_groupstudent 1286
## oracleDziban:searchDFS 1023
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
# plot(models$overall)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plot of selected population-level effects to check for strongly
# correlated parameters. Parameter names must match the fitted model exactly
# ("b_" + coefficient name as shown by summary()). The data were recoded to
# "Dziban"/"DFS", so the coefficients are b_oracleDziban and b_searchDFS;
# the original lowercase names (b_oracledziban, b_searchdfs) do not exist in
# the model and would make pairs() fail to find those parameters.
pairs(
models$overall,
pars = c("b_datasetmovies",
"b_oracleDziban",
"b_searchDFS",
"b_task2.RetrieveValue",
"b_task3.Prediction",
"b_task4.Exploration"),
fixed = TRUE
)
We now look at a response for Overall using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Posterior draws of the expected "overall" rating for each search/oracle
# combination, plotted with 95% and 50% intervals.
overall_plot <- user_response_posterior_draws_plot(user_response_data, models$overall, NULL, "Oracle/Search Combination", "Rating")
overall_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
overall_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS CompassQL 0.768 0.319 1.15 0.95 mean qi
## 2 BFS Dziban 0.775 0.361 1.14 0.95 mean qi
## 3 DFS CompassQL 0.182 -0.333 0.639 0.95 mean qi
## 4 DFS Dziban 0.621 0.181 1.03 0.95 mean qi
## 5 BFS CompassQL 0.768 0.639 0.917 0.5 mean qi
## 6 BFS Dziban 0.775 0.639 0.917 0.5 mean qi
## 7 DFS CompassQL 0.182 0.0139 0.361 0.5 mean qi
## 8 DFS Dziban 0.621 0.486 0.778 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in overall ratings between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).
# Posterior predictive draws for the overall model, marginalizing out the
# participant-level random effect (re_formula = NA), plus a combined
# search/oracle label used by the comparison plots below.
overall_predictive_data <- user_response_data %>%
  add_predicted_draws(models$overall, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))
Differences in user score by search algorithm.
search_differences$overall <- user_response_diff_plot(overall_predictive_data, "search", "overall", "Difference in Overall Rating", "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$overall$plot
Differences in user score by oracle.
# NOTE(review): this hand-rolled difference computation (and the plot built
# from it below) is superseded a few lines later, where
# oracle_differences$overall is overwritten by user_response_diff_plot();
# consider removing the duplicate.
# weighted.mean() is called without weights here, so it is equivalent to mean().
oracle_differences$overall <- overall_predictive_data %>%
group_by(oracle, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = oracle) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
# Tag these differences with the metric name for later pooling.
oracle_differences$overall$metric = "overall"
# NOTE(review): this plot uses the hand-rolled differences computed above,
# which are overwritten immediately afterwards by the shared helper call.
oracle_differences$overall %>%
ggplot(aes(x = diff_in_rating, y = "overall")) +
xlab(paste0("Expected Difference in Rating (",oracle_differences$overall[1,'oracle'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
oracle_differences$overall <- user_response_diff_plot(overall_predictive_data, "oracle", "overall", "Difference in Overall Rating", "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$overall$plot
Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)
# Restrict draws to the two extreme search/oracle combinations before
# computing the posterior difference in overall rating between them.
overall_predictive_data_subset <- overall_predictive_data %>%
  filter(alg %in% c("DFS CompassQL", "BFS Dziban"))
alg_differences$overall <- user_response_diff_plot(
  overall_predictive_data_subset,
  "alg", "overall",
  "Difference in Overall Rating", "Task", NULL
)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$overall$plot
Differences in user score by participant group
participant_group_differences$overall <- user_response_diff_plot(overall_predictive_data, "participant_group", "overall", "Difference in Overall Rating", "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$overall$plot
Putting all of the plots for search algorithm and oracle differences together, split by whether the rating metric is of type confidence or preference. We'll start with differences in search algorithms.
# Pool posterior draws of search-algorithm differences across every rating
# metric, then summarize them with the shared helper.
search_metric_diffs <- list(
  search_differences$confidence_udata$differences,
  search_differences$confidence_ans$differences,
  search_differences$efficiency$differences,
  search_differences$ease_of_use$differences,
  search_differences$utility$differences,
  search_differences$overall$differences
)
combined_search_differences <- do.call(rbind, search_metric_diffs)
search_difference_plots_intervals <- user_response_diff_summary(combined_search_differences, "search")
search_difference_plots_intervals$plot_confidence
View intervals
search_difference_plots_intervals$intervals_confidence
## # A tibble: 4 x 8
## # Groups: search [1]
## search metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS - DFS Answer -0.0224 -0.417 0.361 0.95 mean qi
## 2 BFS - DFS Understanding Data 0.118 -0.278 0.528 0.95 mean qi
## 3 BFS - DFS Answer -0.0224 -0.167 0.111 0.5 mean qi
## 4 BFS - DFS Understanding Data 0.118 -0.0278 0.25 0.5 mean qi
search_difference_plots_intervals$plot_preference
View intervals
search_difference_plots_intervals$intervals_preference
## # A tibble: 8 x 8
## # Groups: search [1]
## search metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS - DFS Overall 0.370 -0.139 0.889 0.95 mean qi
## 2 BFS - DFS Utility 0.562 -0.0833 1.22 0.95 mean qi
## 3 BFS - DFS Ease of Use 0.590 0.0833 1.11 0.95 mean qi
## 4 BFS - DFS Efficiency 0.717 0.0833 1.33 0.95 mean qi
## 5 BFS - DFS Overall 0.370 0.194 0.556 0.5 mean qi
## 6 BFS - DFS Utility 0.562 0.333 0.778 0.5 mean qi
## 7 BFS - DFS Ease of Use 0.590 0.417 0.778 0.5 mean qi
## 8 BFS - DFS Efficiency 0.717 0.5 0.944 0.5 mean qi
# Pool posterior draws of oracle differences across every rating metric, then
# summarize them with the shared helper.
oracle_metric_diffs <- list(
  oracle_differences$confidence_udata$differences,
  oracle_differences$confidence_ans$differences,
  oracle_differences$efficiency$differences,
  oracle_differences$ease_of_use$differences,
  oracle_differences$utility$differences,
  oracle_differences$overall$differences
)
combined_oracle_differences <- do.call(rbind, oracle_metric_diffs)
oracle_difference_plots_intervals <- user_response_diff_summary(combined_oracle_differences, "oracle")
oracle_difference_plots_intervals$plot_confidence
View intervals
oracle_difference_plots_intervals$intervals_confidence
## # A tibble: 4 x 8
## # Groups: oracle [1]
## oracle metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 Dziban - Comp… Answer 0.125 -0.25 0.528 0.95 mean qi
## 2 Dziban - Comp… Understandin… 0.196 -0.194 0.583 0.95 mean qi
## 3 Dziban - Comp… Answer 0.125 0 0.25 0.5 mean qi
## 4 Dziban - Comp… Understandin… 0.196 0.0556 0.333 0.5 mean qi
oracle_difference_plots_intervals$plot_preference
View intervals
oracle_difference_plots_intervals$intervals_preference
## # A tibble: 8 x 8
## # Groups: oracle [1]
## oracle metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 Dziban - Compass… Overall 0.223 -0.278 0.75 0.95 mean qi
## 2 Dziban - Compass… Utility 0.226 -0.417 0.861 0.95 mean qi
## 3 Dziban - Compass… Ease of U… 0.0752 -0.444 0.583 0.95 mean qi
## 4 Dziban - Compass… Efficiency 0.238 -0.417 0.889 0.95 mean qi
## 5 Dziban - Compass… Overall 0.223 0.0556 0.389 0.5 mean qi
## 6 Dziban - Compass… Utility 0.226 0 0.444 0.5 mean qi
## 7 Dziban - Compass… Ease of U… 0.0752 -0.111 0.25 0.5 mean qi
## 8 Dziban - Compass… Efficiency 0.238 0.0278 0.472 0.5 mean qi
# Pool posterior draws of the DFS CompassQL vs BFS Dziban differences across
# every rating metric, then summarize them with the shared helper.
alg_metric_diffs <- list(
  alg_differences$confidence_udata$differences,
  alg_differences$confidence_ans$differences,
  alg_differences$efficiency$differences,
  alg_differences$ease_of_use$differences,
  alg_differences$utility$differences,
  alg_differences$overall$differences
)
combined_alg_differences <- do.call(rbind, alg_metric_diffs)
alg_difference_plots_intervals <- user_response_diff_summary(combined_alg_differences, "alg")
alg_difference_plots_intervals$plot_confidence
View intervals
alg_difference_plots_intervals$intervals_confidence
## # A tibble: 4 x 8
## # Groups: alg [1]
## alg metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS Dziban - DF… Answer 0.103 -0.444 0.667 0.95 mean qi
## 2 BFS Dziban - DF… Understandi… 0.314 -0.278 0.889 0.95 mean qi
## 3 BFS Dziban - DF… Answer 0.103 -0.111 0.278 0.5 mean qi
## 4 BFS Dziban - DF… Understandi… 0.314 0.111 0.5 0.5 mean qi
alg_difference_plots_intervals$plot_preference
View intervals
alg_difference_plots_intervals$intervals_preference
## # A tibble: 8 x 8
## # Groups: alg [1]
## alg metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS Dziban - DFS … Overall 0.593 -0.167 1.33 0.95 mean qi
## 2 BFS Dziban - DFS … Utility 0.788 -0.111 1.72 0.95 mean qi
## 3 BFS Dziban - DFS … Ease of … 0.665 -0.0556 1.44 0.95 mean qi
## 4 BFS Dziban - DFS … Efficien… 0.955 0.0556 1.83 0.95 mean qi
## 5 BFS Dziban - DFS … Overall 0.593 0.333 0.833 0.5 mean qi
## 6 BFS Dziban - DFS … Utility 0.788 0.5 1.11 0.5 mean qi
## 7 BFS Dziban - DFS … Ease of … 0.665 0.389 0.944 0.5 mean qi
## 8 BFS Dziban - DFS … Efficien… 0.955 0.667 1.28 0.5 mean qi
# Pool posterior draws of participant-group differences across every rating
# metric, then summarize them with the shared helper.
group_metric_diffs <- list(
  participant_group_differences$confidence_udata$differences,
  participant_group_differences$confidence_ans$differences,
  participant_group_differences$efficiency$differences,
  participant_group_differences$ease_of_use$differences,
  participant_group_differences$utility$differences,
  participant_group_differences$overall$differences
)
combined_participant_group_differences <- do.call(rbind, group_metric_diffs)
participant_group_difference_plots_intervals <- user_response_diff_summary(combined_participant_group_differences, "participant_group")
participant_group_difference_plots_intervals$plot_confidence
View intervals
participant_group_difference_plots_intervals$intervals_confidence
## # A tibble: 4 x 8
## # Groups: participant_group [1]
## participant_group metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 student - profess… Answer 0.0752 -0.319 0.481 0.95 mean qi
## 2 student - profess… Understa… 0.0878 -0.294 0.469 0.95 mean qi
## 3 student - profess… Answer 0.0752 -0.0563 0.206 0.5 mean qi
## 4 student - profess… Understa… 0.0878 -0.0438 0.219 0.5 mean qi
participant_group_difference_plots_intervals$plot_preference
View intervals
participant_group_difference_plots_intervals$intervals_preference
## # A tibble: 8 x 8
## # Groups: participant_group [1]
## participant_group metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 student - professi… Overall 0.278 -0.231 0.825 0.95 mean qi
## 2 student - professi… Utility 0.183 -0.456 0.831 0.95 mean qi
## 3 student - professi… Ease of… 0.318 -0.2 0.837 0.95 mean qi
## 4 student - professi… Efficie… 0.133 -0.513 0.794 0.95 mean qi
## 5 student - professi… Overall 0.278 0.1 0.45 0.5 mean qi
## 6 student - professi… Utility 0.183 -0.0375 0.400 0.5 mean qi
## 7 student - professi… Ease of… 0.318 0.144 0.494 0.5 mean qi
## 8 student - professi… Efficie… 0.133 -0.0938 0.356 0.5 mean qi
## Histograms for Response Distributions

Here we plot out the number of responses for each rating (-2 to 2 inclusive) across all of our user metrics (Confidence in Understanding Data, Confidence in Answer, Efficiency, Ease of Use, Utility, and Overall). Because each user completed 4 tasks, the total number of responses in these graphs is four times the total number of users in our study.
# Capitalize dataset names for nicer facet labels.
user_response_data$dataset <- gsub('birdstrikes', 'Birdstrikes', user_response_data$dataset)
user_response_data$dataset <- gsub('movies', 'Movies', user_response_data$dataset)

# Bar chart of response counts for one rating metric, faceted by dataset
# (rows) and search/oracle combination (columns).
#   data       - data frame with the metric column plus dataset/search/oracle
#   metric     - metric column name as a string (tidy-eval .data pronoun)
#   axis_label - x-axis title for the plot
plot_rating_histogram <- function(data, metric, axis_label) {
  data %>%
    ggplot(aes(x = .data[[metric]])) +
    geom_bar() +
    xlab(axis_label) +
    ylab("Number of Responses") +
    facet_grid(dataset ~ search + oracle)
}

# One histogram per user metric. print() is required so each plot renders
# even though the call is not a bare top-level ggplot expression.
print(plot_rating_histogram(user_response_data, "confidence.udata", "Confidence in Understanding Data Rating"))
print(plot_rating_histogram(user_response_data, "confidence.ans", "Confidence in Answer Rating"))
print(plot_rating_histogram(user_response_data, "efficiency", "Efficiency Rating"))
print(plot_rating_histogram(user_response_data, "ease.of.use", "Ease of Use Rating"))
print(plot_rating_histogram(user_response_data, "utility", "Utility Rating"))
print(plot_rating_histogram(user_response_data, "overall", "Overall Rating"))